data_url <- "https://raw.githubusercontent.com/sushmaakoju/spotify-tracks-data-analysis/main/SpotifyFeatures.csv"
column_names <- c("genre","artist_name","track_name","track_id","popularity","acousticness","danceability","duration_ms","energy","instrumentalness","key","liveness","loudness","mode","speechiness","tempo","time_signature","valence")
spotifydf <- read_csv(data_url, show_col_types=FALSE)
head(spotifydf)
## # A tibble: 6 x 18
## genre artist_name track_name track_id popularity acousticness danceability
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Movie Henri Salva~ C'est beau ~ 0BRjO6ga~ 0 0.611 0.389
## 2 Movie Martin & le~ Perdu d'ava~ 0BjC1Nfo~ 1 0.246 0.59
## 3 Movie Joseph Will~ Don't Let M~ 0CoSDzoN~ 3 0.952 0.663
## 4 Movie Henri Salva~ Dis-moi Mon~ 0Gc6TVm5~ 0 0.703 0.24
## 5 Movie Fabien Nataf Ouverture 0IuslXpM~ 4 0.95 0.331
## 6 Movie Henri Salva~ Le petit so~ 0Mf1jKa8~ 0 0.749 0.578
## # ... with 11 more variables: duration_ms <dbl>, energy <dbl>,
## # instrumentalness <dbl>, key <chr>, liveness <dbl>, loudness <dbl>,
## # mode <chr>, speechiness <dbl>, tempo <dbl>, time_signature <chr>,
## # valence <dbl>
colSums(is.na(spotifydf))
## genre artist_name track_name track_id
## 0 0 0 0
## popularity acousticness danceability duration_ms
## 0 0 0 0
## energy instrumentalness key liveness
## 0 0 0 0
## loudness mode speechiness tempo
## 0 0 0 0
## time_signature valence
## 0 0
head( spotifydf)
## # A tibble: 6 x 18
## genre artist_name track_name track_id popularity acousticness danceability
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Movie Henri Salva~ C'est beau ~ 0BRjO6ga~ 0 0.611 0.389
## 2 Movie Martin & le~ Perdu d'ava~ 0BjC1Nfo~ 1 0.246 0.59
## 3 Movie Joseph Will~ Don't Let M~ 0CoSDzoN~ 3 0.952 0.663
## 4 Movie Henri Salva~ Dis-moi Mon~ 0Gc6TVm5~ 0 0.703 0.24
## 5 Movie Fabien Nataf Ouverture 0IuslXpM~ 4 0.95 0.331
## 6 Movie Henri Salva~ Le petit so~ 0Mf1jKa8~ 0 0.749 0.578
## # ... with 11 more variables: duration_ms <dbl>, energy <dbl>,
## # instrumentalness <dbl>, key <chr>, liveness <dbl>, loudness <dbl>,
## # mode <chr>, speechiness <dbl>, tempo <dbl>, time_signature <chr>,
## # valence <dbl>
glimpse(spotifydf)
## Rows: 232,725
## Columns: 18
## $ genre <chr> "Movie", "Movie", "Movie", "Movie", "Movie", "Movie",~
## $ artist_name <chr> "Henri Salvador", "Martin & les fées", "Joseph Willia~
## $ track_name <chr> "C'est beau de faire un Show", "Perdu d'avance (par G~
## $ track_id <chr> "0BRjO6ga9RKCKjfDqeFgWV", "0BjC1NfoEOOusryehmNudP", "~
## $ popularity <dbl> 0, 1, 3, 0, 4, 0, 2, 15, 0, 10, 0, 2, 4, 3, 0, 0, 0, ~
## $ acousticness <dbl> 0.61100, 0.24600, 0.95200, 0.70300, 0.95000, 0.74900,~
## $ danceability <dbl> 0.389, 0.590, 0.663, 0.240, 0.331, 0.578, 0.703, 0.41~
## $ duration_ms <dbl> 99373, 137373, 170267, 152427, 82625, 160627, 212293,~
## $ energy <dbl> 0.9100, 0.7370, 0.1310, 0.3260, 0.2250, 0.0948, 0.270~
## $ instrumentalness <dbl> 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.123~
## $ key <chr> "C#", "F#", "C", "C#", "F", "C#", "C#", "F#", "C", "G~
## $ liveness <dbl> 0.3460, 0.1510, 0.1030, 0.0985, 0.2020, 0.1070, 0.105~
## $ loudness <dbl> -1.828, -5.559, -13.879, -12.178, -21.150, -14.970, -~
## $ mode <chr> "Major", "Minor", "Minor", "Major", "Major", "Major",~
## $ speechiness <dbl> 0.0525, 0.0868, 0.0362, 0.0395, 0.0456, 0.1430, 0.953~
## $ tempo <dbl> 166.969, 174.003, 99.488, 171.758, 140.576, 87.479, 8~
## $ time_signature <chr> "4/4", "4/4", "5/4", "4/4", "4/4", "4/4", "4/4", "4/4~
## $ valence <dbl> 0.8140, 0.8160, 0.3680, 0.2270, 0.3900, 0.3580, 0.533~
na.omit(spotifydf)
## # A tibble: 232,725 x 18
## genre artist_name track_name track_id popularity acousticness danceability
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Movie Henri Salva~ C'est beau ~ 0BRjO6g~ 0 0.611 0.389
## 2 Movie Martin & le~ Perdu d'ava~ 0BjC1Nf~ 1 0.246 0.59
## 3 Movie Joseph Will~ Don't Let M~ 0CoSDzo~ 3 0.952 0.663
## 4 Movie Henri Salva~ Dis-moi Mon~ 0Gc6TVm~ 0 0.703 0.24
## 5 Movie Fabien Nataf Ouverture 0IuslXp~ 4 0.95 0.331
## 6 Movie Henri Salva~ Le petit so~ 0Mf1jKa~ 0 0.749 0.578
## 7 Movie Martin & le~ Premières r~ 0NUiKYR~ 2 0.344 0.703
## 8 Movie Laura Mayne Let Me Let ~ 0PbIF9Y~ 15 0.939 0.416
## 9 Movie Chorus Helka 0ST6uPf~ 0 0.00104 0.734
## 10 Movie Le Club des~ Les bisous ~ 0VSqZ3K~ 10 0.319 0.598
## # ... with 232,715 more rows, and 11 more variables: duration_ms <dbl>,
## # energy <dbl>, instrumentalness <dbl>, key <chr>, liveness <dbl>,
## # loudness <dbl>, mode <chr>, speechiness <dbl>, tempo <dbl>,
## # time_signature <chr>, valence <dbl>
genres <- distinct(spotifydf, genre)$genre
genres
## [1] "Movie" "R&B" "A Capella" "Alternative"
## [5] "Country" "Dance" "Electronic" "Anime"
## [9] "Folk" "Blues" "Opera" "Hip-Hop"
## [13] "Children's Music" "Children’s Music" "Rap" "Indie"
## [17] "Classical" "Pop" "Reggae" "Reggaeton"
## [21] "Jazz" "Rock" "Ska" "Comedy"
## [25] "Soul" "Soundtrack" "World"
spotifydf$genre[spotifydf$genre=="Children's Music"] <- "Children’s Music"
spotifydf[!duplicated(spotifydf$track_id),]
## # A tibble: 176,774 x 18
## genre artist_name track_name track_id popularity acousticness danceability
## <chr> <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Movie Henri Salva~ C'est beau ~ 0BRjO6g~ 0 0.611 0.389
## 2 Movie Martin & le~ Perdu d'ava~ 0BjC1Nf~ 1 0.246 0.59
## 3 Movie Joseph Will~ Don't Let M~ 0CoSDzo~ 3 0.952 0.663
## 4 Movie Henri Salva~ Dis-moi Mon~ 0Gc6TVm~ 0 0.703 0.24
## 5 Movie Fabien Nataf Ouverture 0IuslXp~ 4 0.95 0.331
## 6 Movie Henri Salva~ Le petit so~ 0Mf1jKa~ 0 0.749 0.578
## 7 Movie Martin & le~ Premières r~ 0NUiKYR~ 2 0.344 0.703
## 8 Movie Laura Mayne Let Me Let ~ 0PbIF9Y~ 15 0.939 0.416
## 9 Movie Chorus Helka 0ST6uPf~ 0 0.00104 0.734
## 10 Movie Le Club des~ Les bisous ~ 0VSqZ3K~ 10 0.319 0.598
## # ... with 176,764 more rows, and 11 more variables: duration_ms <dbl>,
## # energy <dbl>, instrumentalness <dbl>, key <chr>, liveness <dbl>,
## # loudness <dbl>, mode <chr>, speechiness <dbl>, tempo <dbl>,
## # time_signature <chr>, valence <dbl>
genres <- distinct(spotifydf, genre)$genre
unique(spotifydf$key)
## [1] "C#" "F#" "C" "F" "G" "E" "D#" "G#" "D" "A#" "A" "B"
unique(as.numeric(as.factor(spotifydf$key)))
## [1] 5 10 4 9 11 8 7 12 6 2 1 3
spotifydf$key <- as.numeric(as.factor(spotifydf$key))
unique(spotifydf$mode)
## [1] "Major" "Minor"
unique(as.numeric(as.factor(spotifydf$mode)))
## [1] 1 2
spotifydf$mode <- as.numeric(as.factor(spotifydf$mode))
str(spotifydf)
## spec_tbl_df [232,725 x 18] (S3: spec_tbl_df/tbl_df/tbl/data.frame)
## $ genre : chr [1:232725] "Movie" "Movie" "Movie" "Movie" ...
## $ artist_name : chr [1:232725] "Henri Salvador" "Martin & les fées" "Joseph Williams" "Henri Salvador" ...
## $ track_name : chr [1:232725] "C'est beau de faire un Show" "Perdu d'avance (par Gad Elmaleh)" "Don't Let Me Be Lonely Tonight" "Dis-moi Monsieur Gordon Cooper" ...
## $ track_id : chr [1:232725] "0BRjO6ga9RKCKjfDqeFgWV" "0BjC1NfoEOOusryehmNudP" "0CoSDzoNIKCRs124s9uTVy" "0Gc6TVm52BwZD07Ki6tIvf" ...
## $ popularity : num [1:232725] 0 1 3 0 4 0 2 15 0 10 ...
## $ acousticness : num [1:232725] 0.611 0.246 0.952 0.703 0.95 0.749 0.344 0.939 0.00104 0.319 ...
## $ danceability : num [1:232725] 0.389 0.59 0.663 0.24 0.331 0.578 0.703 0.416 0.734 0.598 ...
## $ duration_ms : num [1:232725] 99373 137373 170267 152427 82625 ...
## $ energy : num [1:232725] 0.91 0.737 0.131 0.326 0.225 0.0948 0.27 0.269 0.481 0.705 ...
## $ instrumentalness: num [1:232725] 0 0 0 0 0.123 0 0 0 0.00086 0.00125 ...
## $ key : num [1:232725] 5 10 4 5 9 5 5 10 4 11 ...
## $ liveness : num [1:232725] 0.346 0.151 0.103 0.0985 0.202 0.107 0.105 0.113 0.0765 0.349 ...
## $ loudness : num [1:232725] -1.83 -5.56 -13.88 -12.18 -21.15 ...
## $ mode : num [1:232725] 1 2 2 1 1 1 1 1 1 1 ...
## $ speechiness : num [1:232725] 0.0525 0.0868 0.0362 0.0395 0.0456 0.143 0.953 0.0286 0.046 0.0281 ...
## $ tempo : num [1:232725] 167 174 99.5 171.8 140.6 ...
## $ time_signature : chr [1:232725] "4/4" "4/4" "5/4" "4/4" ...
## $ valence : num [1:232725] 0.814 0.816 0.368 0.227 0.39 0.358 0.533 0.274 0.765 0.718 ...
## - attr(*, "spec")=
## .. cols(
## .. genre = col_character(),
## .. artist_name = col_character(),
## .. track_name = col_character(),
## .. track_id = col_character(),
## .. popularity = col_double(),
## .. acousticness = col_double(),
## .. danceability = col_double(),
## .. duration_ms = col_double(),
## .. energy = col_double(),
## .. instrumentalness = col_double(),
## .. key = col_character(),
## .. liveness = col_double(),
## .. loudness = col_double(),
## .. mode = col_character(),
## .. speechiness = col_double(),
## .. tempo = col_double(),
## .. time_signature = col_character(),
## .. valence = col_double()
## .. )
## - attr(*, "problems")=<externalptr>
columns <- colnames(spotifydf)
columns
## [1] "genre" "artist_name" "track_name" "track_id"
## [5] "popularity" "acousticness" "danceability" "duration_ms"
## [9] "energy" "instrumentalness" "key" "liveness"
## [13] "loudness" "mode" "speechiness" "tempo"
## [17] "time_signature" "valence"
numeric_columns <- unlist(lapply(spotifydf, is.numeric))
numeric_columns
## genre artist_name track_name track_id
## FALSE FALSE FALSE FALSE
## popularity acousticness danceability duration_ms
## TRUE TRUE TRUE TRUE
## energy instrumentalness key liveness
## TRUE TRUE TRUE TRUE
## loudness mode speechiness tempo
## TRUE TRUE TRUE TRUE
## time_signature valence
## FALSE TRUE
numeric_spotifydf <- spotifydf[,numeric_columns]
colnames(numeric_spotifydf)
## [1] "popularity" "acousticness" "danceability" "duration_ms"
## [5] "energy" "instrumentalness" "key" "liveness"
## [9] "loudness" "mode" "speechiness" "tempo"
## [13] "valence"
corr_df <- correlate(numeric_spotifydf, quiet = TRUE)
corr_df
## # A tibble: 13 x 14
## term popularity acousticness danceability duration_ms energy
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 popularity NA -0.381 0.257 0.00235 0.249
## 2 acousticness -0.381 NA -0.365 0.0112 -0.726
## 3 danceability 0.257 -0.365 NA -0.126 0.326
## 4 duration_ms 0.00235 0.0112 -0.126 NA -0.0305
## 5 energy 0.249 -0.726 0.326 -0.0305 NA
## 6 instrumentalness -0.211 0.316 -0.365 0.0760 -0.379
## 7 key -0.000943 0.0143 -0.00700 -0.00351 -0.0104
## 8 liveness -0.168 0.0690 -0.0417 0.0238 0.193
## 9 loudness 0.363 -0.690 0.439 -0.0476 0.816
## 10 mode 0.0706 -0.0559 0.0619 0.0116 0.0413
## 11 speechiness -0.151 0.151 0.135 -0.0162 0.145
## 12 tempo 0.0810 -0.238 0.0219 -0.0285 0.229
## 13 valence 0.0601 -0.326 0.547 -0.142 0.437
## # ... with 8 more variables: instrumentalness <dbl>, key <dbl>, liveness <dbl>,
## # loudness <dbl>, mode <dbl>, speechiness <dbl>, tempo <dbl>, valence <dbl>
corr_df %>%
select(-term) %>%
map_dbl(~ mean(., na.rm = TRUE))
## popularity acousticness danceability duration_ms
## 0.014184858 -0.184994439 0.073552896 -0.022415605
## energy instrumentalness key liveness
## 0.107507980 -0.145097040 -0.006138012 0.036819088
## loudness mode speechiness tempo
## 0.088582298 0.012430743 0.046871552 0.014349281
## valence
## 0.069672654
corr_df2 <- cor(numeric_spotifydf)
col3 = hcl.colors(10, "YlOrRd", rev = TRUE)
corr1 <- corrplot(corr_df2, col=col3, method = 'number')
corrplot(corr_df2, order = 'hclust', addrect = 2)
corrplot(corr_df2/2, col.lim=c(-0.5, 0.5))
genres_df <- spotifydf %>%
select(popularity, genre)%>%
count(popularity, genre)
by_genres <- spotifydf %>% group_by(genre, popularity)
by_genres %>% summarise(
popularity = mean(popularity),
)
## `summarise()` has grouped output by 'genre'. You can override using the `.groups` argument.
## # A tibble: 1,802 x 2
## # Groups: genre [26]
## genre popularity
## <chr> <dbl>
## 1 A Capella 0
## 2 A Capella 1
## 3 A Capella 2
## 4 A Capella 3
## 5 A Capella 4
## 6 A Capella 5
## 7 A Capella 6
## 8 A Capella 7
## 9 A Capella 8
## 10 A Capella 9
## # ... with 1,792 more rows
by_genre <- by_genres %>% summarise(n = n())
## `summarise()` has grouped output by 'genre'. You can override using the `.groups` argument.
by_genre %>% summarise(n = sum(n)) %>% filter(n>0)
## # A tibble: 26 x 2
## genre n
## <chr> <int>
## 1 A Capella 119
## 2 Alternative 9263
## 3 Anime 8936
## 4 Blues 9023
## 5 Children’s Music 14756
## 6 Classical 9256
## 7 Comedy 9681
## 8 Country 8664
## 9 Dance 8701
## 10 Electronic 9377
## # ... with 16 more rows
#vtree(genres_df, "genre")
colnames(by_genre)
## [1] "genre" "popularity" "n"
ggplot(genres_df)+
geom_point(aes(x = genre, y= n))
genres_df %>% ggplot(aes(x = genre, y= n))+
geom_line(aes(color = "genre"))
ggplot(data=by_genre,aes(x = genre, y = n, fill=genre)) +
geom_bar(stat="identity", width=0.5, position=position_dodge())+
coord_flip()+
labs(title = "Spotify tracks by Genre in US", y= NULL)
genres_df <- spotifydf %>%
select(-c(artist_name, track_name, track_id, time_signature, key))
spotify_summary <- datasummary_skim(numeric_spotifydf)
spotify_summary
| Unique (#) | Missing (%) | Mean | SD | Min | Median | Max | ||
|---|---|---|---|---|---|---|---|---|
| popularity | 101 | 0 | 41.1 | 18.2 | 0.0 | 43.0 | 100.0 | |
| acousticness | 4734 | 0 | 0.4 | 0.4 | 0.0 | 0.2 | 1.0 | |
| danceability | 1295 | 0 | 0.6 | 0.2 | 0.1 | 0.6 | 1.0 | |
| duration_ms | 70749 | 0 | 235122.3 | 118935.9 | 15387.0 | 220427.0 | 5552917.0 | |
| energy | 2517 | 0 | 0.6 | 0.3 | 0.0 | 0.6 | 1.0 | |
| instrumentalness | 5400 | 0 | 0.1 | 0.3 | 0.0 | 0.0 | 1.0 | |
| key | 12 | 0 | 6.3 | 3.5 | 1.0 | 6.0 | 12.0 | |
| liveness | 1732 | 0 | 0.2 | 0.2 | 0.0 | 0.1 | 1.0 | |
| loudness | 27923 | 0 | -9.6 | 6.0 | -52.5 | -7.8 | 3.7 | |
| mode | 2 | 0 | 1.3 | 0.5 | 1.0 | 1.0 | 2.0 | |
| speechiness | 1641 | 0 | 0.1 | 0.2 | 0.0 | 0.1 | 1.0 | |
| tempo | 78512 | 0 | 117.7 | 30.9 | 30.4 | 115.8 | 242.9 | |
| valence | 1692 | 0 | 0.5 | 0.3 | 0.0 | 0.4 | 1.0 |
spotify_histograms <- numeric_spotifydf[,-c(4)]
plot_num(spotify_histograms)
## Warning: `guides(<scale> = FALSE)` is deprecated. Please use `guides(<scale> =
## "none")` instead.
ggplot(spotifydf, aes(popularity)) +
geom_density()
ggplot(spotifydf, aes(energy)) +
geom_density()
ggplot(spotifydf, aes(danceability)) +
geom_density()
ggplot(spotifydf, aes(key)) +
geom_density()
lmfit = lm(popularity ~ acousticness + danceability + duration_ms +energy + instrumentalness + liveness + loudness + speechiness + tempo + valence, numeric_spotifydf )
summary(lmfit)
##
## Call:
## lm(formula = popularity ~ acousticness + danceability + duration_ms +
## energy + instrumentalness + liveness + loudness + speechiness +
## tempo + valence, data = numeric_spotifydf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -58.034 -10.215 1.413 11.058 57.368
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 55.3882318323 0.3440857483 160.972 < 0.0000000000000002 ***
## acousticness -11.9348863471 0.1547306909 -77.133 < 0.0000000000000002 ***
## danceability 17.7399712075 0.2421276181 73.267 < 0.0000000000000002 ***
## duration_ms 0.0000024569 0.0000002816 8.726 < 0.0000000000000002 ***
## energy -5.6278359232 0.2794090244 -20.142 < 0.0000000000000002 ***
## instrumentalness -4.3105903938 0.1330431074 -32.400 < 0.0000000000000002 ***
## liveness -9.6705519516 0.2011430478 -48.078 < 0.0000000000000002 ***
## loudness 0.7150992187 0.0112616140 63.499 < 0.0000000000000002 ***
## speechiness -8.1081609287 0.2298549766 -35.275 < 0.0000000000000002 ***
## tempo -0.0042749795 0.0011180789 -3.824 0.000132 ***
## valence -13.2258861142 0.1659321212 -79.707 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.92 on 232714 degrees of freedom
## Multiple R-squared: 0.2339, Adjusted R-squared: 0.2339
## F-statistic: 7106 on 10 and 232714 DF, p-value: < 0.00000000000000022
plot(lmfit)
coefficients(lmfit)
## (Intercept) acousticness danceability duration_ms
## 55.388231832270 -11.934886347087 17.739971207543 0.000002456887
## energy instrumentalness liveness loudness
## -5.627835923246 -4.310590393787 -9.670551951580 0.715099218724
## speechiness tempo valence
## -8.108160928731 -0.004274979477 -13.225886114236
lmfit2 = lm(popularity ~ acousticness + danceability +energy + instrumentalness + liveness + loudness + speechiness + tempo + valence + key+mode+(energy * loudness) + (valence * danceability) , numeric_spotifydf )
summary(lmfit2)
##
## Call:
## lm(formula = popularity ~ acousticness + danceability + energy +
## instrumentalness + liveness + loudness + speechiness + tempo +
## valence + key + mode + (energy * loudness) + (valence * danceability),
## data = numeric_spotifydf)
##
## Residuals:
## Min 1Q Median 3Q Max
## -60.60 -10.16 1.42 10.98 57.75
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 48.642916 0.383028 126.996 < 0.0000000000000002 ***
## acousticness -11.869822 0.154208 -76.973 < 0.0000000000000002 ***
## danceability 26.491613 0.379808 69.750 < 0.0000000000000002 ***
## energy -4.806738 0.303869 -15.818 < 0.0000000000000002 ***
## instrumentalness -4.074058 0.133798 -30.449 < 0.0000000000000002 ***
## liveness -9.667267 0.201558 -47.963 < 0.0000000000000002 ***
## loudness 0.627526 0.012114 51.802 < 0.0000000000000002 ***
## speechiness -8.102821 0.232924 -34.787 < 0.0000000000000002 ***
## tempo -0.008507 0.001124 -7.569 0.0000000000000376 ***
## valence 0.836879 0.480117 1.743 0.0813 .
## key 0.038810 0.009514 4.079 0.0000452207135986 ***
## mode 1.895830 0.069785 27.167 < 0.0000000000000002 ***
## energy:loudness 0.269525 0.024068 11.198 < 0.0000000000000002 ***
## danceability:valence -23.230354 0.749642 -30.989 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 15.87 on 232711 degrees of freedom
## Multiple R-squared: 0.2393, Adjusted R-squared: 0.2392
## F-statistic: 5630 on 13 and 232711 DF, p-value: < 0.00000000000000022
models <- list(
"OLS" = lmfit2,
"GLM" = glm(popularity ~ acousticness + danceability +energy + instrumentalness + liveness + loudness + speechiness + tempo + valence + key+mode+(energy * loudness) + (valence * danceability) , data = numeric_spotifydf , family = gaussian)
)
modelsummary(models,
fmt = 1,
estimate = c(
"{estimate} ({std.error})",
"{estimate} [{conf.low}, {conf.high}]"),
statistic = NULL,
coef_omit = "Intercept",
output = "table1.png")
## save_kable will have the best result with magick installed.
modelsummary(models,gof_omit = ".*",
statistic = c("conf.int",
"s.e. = {std.error}",
"t = {statistic}",
"p = {p.value}"),
output = "table2.png")
## save_kable will have the best result with magick installed.
train = sample(1:nrow(numeric_spotifydf), 300)
rf.spotify = randomForest(popularity~., data = numeric_spotifydf, subset = train)
rf.spotify
##
## Call:
## randomForest(formula = popularity ~ ., data = numeric_spotifydf, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 250.2799
## % Var explained: 27.11
The following plot suggests that 50 trees or so is enough to fit the model with RFR.
plot(rf.spotify)
oob.err = double(17)
test.err = double(17)
for(mtry in 1:17){
fit = randomForest(popularity~., data = numeric_spotifydf, subset=train, mtry=mtry, ntree = 350)
oob.err[mtry] = fit$mse[350]
pred = predict(fit, numeric_spotifydf[-train,])
test.err[mtry] = with(numeric_spotifydf[-train,], mean( (popularity-pred)^2 ))
}
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
## Warning in randomForest.default(m, y, ...): invalid mtry: reset to within valid
## range
matplot(1:mtry, cbind(test.err, oob.err), pch = 23, col = c("red", "blue"), type = "b", ylab="Mean Squared Error", lwd=6)
legend("topright", legend = c("OOB", "Test"), pch = 23, col = c("red", "blue"))
oob.err
## [1] 265.3070 252.7247 255.1290 245.8766 254.4760 254.0606 251.6458 249.0730
## [9] 259.7739 248.7766 253.3789 256.4281 252.0817 254.7610 250.2245 246.3955
## [17] 247.8453
test.err
## [1] 250.3874 245.9252 246.6473 247.4580 247.7958 249.3150 251.3474 251.7642
## [9] 254.0246 255.5279 254.0574 253.9808 254.2413 254.9214 254.6739 254.0708
## [17] 253.7098
rf.spotify1 = randomForest(popularity~., data = numeric_spotifydf, subset = train, importance = TRUE)
rf.spotify1
##
## Call:
## randomForest(formula = popularity ~ ., data = numeric_spotifydf, importance = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 254.363
## % Var explained: 25.92
summary(rf.spotify1)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 300 -none- numeric
## mse 500 -none- numeric
## rsq 500 -none- numeric
## oob.times 300 -none- numeric
## importance 24 -none- numeric
## importanceSD 12 -none- numeric
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 300 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
plot(rf.spotify1)
rf.spotify1$importance
## %IncMSE IncNodePurity
## acousticness 83.8293083 16303.9087
## danceability 0.9757459 6666.9743
## duration_ms 36.1802669 13793.4227
## energy 34.0491149 8692.6904
## instrumentalness 14.9096025 6463.0006
## key -0.4889556 3558.2322
## liveness 2.7009406 6646.8233
## loudness 49.3742291 11173.8681
## mode 0.3512303 704.8762
## speechiness 17.3380771 7831.1208
## tempo 7.5762679 7557.1762
## valence 14.3065880 7843.9460
rf.spotify1$importanceSD
## acousticness danceability duration_ms energy
## 3.7964232 1.6640476 2.3834029 2.9363239
## instrumentalness key liveness loudness
## 1.8942150 1.1909310 1.4847242 2.9806934
## mode speechiness tempo valence
## 0.5247077 1.7392226 1.8073582 1.9422187
create_rfplot <- function(rf, type){
imp <- importance(rf, type = type, scale = F)
featureImportance <- data.frame(Feature = row.names(imp), Importance = imp[,1])
p <- ggplot(featureImportance, aes(x = reorder(Feature, Importance), y = Importance)) +
geom_bar(stat = "identity", fill = featureImportance$Importance, width = 0.65) +
coord_flip() +
theme_light(base_size = 25) +
theme(axis.title.x = element_text(size = 20, color = "black"),
axis.title.y = element_blank(),
axis.text.x = element_text(size = 20, color = "black"),
axis.text.y = element_text(size = 20, color = "black"))
return(p)
}
create_rfplot(rf.spotify1, type = 2)
data.frame(Feature = row.names(rf.spotify1$importance), Importance = rf.spotify1$importance[,1])
## Feature Importance
## acousticness acousticness 83.8293083
## danceability danceability 0.9757459
## duration_ms duration_ms 36.1802669
## energy energy 34.0491149
## instrumentalness instrumentalness 14.9096025
## key key -0.4889556
## liveness liveness 2.7009406
## loudness loudness 49.3742291
## mode mode 0.3512303
## speechiness speechiness 17.3380771
## tempo tempo 7.5762679
## valence valence 14.3065880
rf.spotify2 = randomForest(popularity~., data = numeric_spotifydf, subset = train, replace = FALSE, nodesize = 7, keep.forest = TRUE, keep.inbag = TRUE)
rf.spotify2
##
## Call:
## randomForest(formula = popularity ~ ., data = numeric_spotifydf, replace = FALSE, nodesize = 7, keep.forest = TRUE, keep.inbag = TRUE, subset = train)
## Type of random forest: regression
## Number of trees: 500
## No. of variables tried at each split: 4
##
## Mean of squared residuals: 249.9579
## % Var explained: 27.2
permimp <- permimp(rf.spotify2, conditional = TRUE, progressBar = FALSE, do_check=FALSE)
permimp$values
## acousticness danceability duration_ms energy
## 12.5872950 -0.4810545 27.1125698 1.8920315
## instrumentalness key liveness loudness
## 4.8026874 -0.6407343 1.8529868 1.2190515
## mode speechiness tempo valence
## 0.1162108 10.3474736 9.2585995 8.2712024
ggplot(as.data.frame(permimp$values), aes(x = reorder(names(permimp$values)
, as.numeric(permimp$values)), y = as.numeric(permimp$values))) +
geom_bar(stat = "identity", fill = as.factor(seq(0,11)), width = 0.65) +
coord_flip() +
theme_light(base_size = 25) +
theme(axis.title.x = element_text(size = 20, color = "black"),
axis.title.y = element_blank(),
axis.text.x = element_text(size = 20, color = "black"),
axis.text.y = element_text(size = 20, color = "black"))+xlab("Features")+ylab("Importance")